Setup

# Import functions
root = "./"
source(file.path(root,"general_functions.R"))
import_parameters(params)
## **** __Utilized Cores__ **** = 4$subsetGenes
## [1] "protein_coding"
## 
## $subsetCells
## [1] "400"
## 
## $resolution
## [1] "0.2"
## 
## $resultsPath
## [1] "Results/subsetGenes-protein_coding__subsetCells-400__Resolution-0.2__perplexity-30__nCores-4"
## 
## $nCores
## [1] 4
## 
## $perplexity
## [1] "30"
load(file.path("scRNAseq_results.RData"))

Load Libraries

library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr) 
library(plotly)
library(ggplot2)
library(viridis)
library(reshape2)
library(shiny) 
library(ggrepel)
library(DT) 
library(ComplexHeatmap); #BiocManager::install("ComplexHeatmap") 
  
## Install Bioconductor
#  if (!requireNamespace("BiocManager"))
#     install.packages("BiocManager") 
library(biomaRt) # BiocManager::install(c("biomaRt"))
# library(DESeq2) # BiocManager::install(c("DESeq2"))
library(enrichR) #BiocManager::install("enrichR")

library(monocle) #BiocManager::install("monocle")
# BiocManager::install("DelayedMatrixStats")
# BiocManager::install("org.Mm.eg.db") 
library(org.Hs.eg.db)
library(garnett) # devtools::install_github("cole-trapnell-lab/garnett") 
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] splines   stats4    parallel  grid      stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] garnett_0.1.1         org.Hs.eg.db_3.7.0    AnnotationDbi_1.44.0 
##  [4] IRanges_2.16.0        S4Vectors_0.20.1      monocle_2.10.1       
##  [7] DDRTree_0.1.5         irlba_2.3.3           VGAM_1.0-6           
## [10] Biobase_2.42.0        BiocGenerics_0.28.0   enrichR_1.0          
## [13] biomaRt_2.38.0        ComplexHeatmap_1.20.0 ggrepel_0.8.0        
## [16] reshape2_1.4.3        viridis_0.5.1         viridisLite_0.3.0    
## [19] DT_0.5.1              shiny_1.2.0           plotly_4.8.0         
## [22] knitr_1.21            gridExtra_2.3         dplyr_0.7.8          
## [25] Seurat_2.3.4          Matrix_1.2-15         cowplot_0.9.4        
## [28] ggplot2_3.1.0        
## 
## loaded via a namespace (and not attached):
##   [1] snow_0.4-3           backports_1.1.3      circlize_0.4.5      
##   [4] Hmisc_4.2-0          plyr_1.8.4           igraph_1.2.2        
##   [7] lazyeval_0.2.1       densityClust_0.3     crosstalk_1.0.0     
##  [10] fastICA_1.2-1        digest_0.6.18        foreach_1.4.4       
##  [13] htmltools_0.3.6      lars_1.2             gdata_2.18.0        
##  [16] memoise_1.1.0        magrittr_1.5         checkmate_1.9.1     
##  [19] cluster_2.0.7-1      mixtools_1.1.0       ROCR_1.0-7          
##  [22] limma_3.38.3         matrixStats_0.54.0   docopt_0.6.1        
##  [25] R.utils_2.7.0        prettyunits_1.0.2    colorspace_1.4-0    
##  [28] blob_1.1.1           xfun_0.4             sparsesvd_0.1-4     
##  [31] crayon_1.3.4         RCurl_1.95-4.11      jsonlite_1.6        
##  [34] bindr_0.1.1          survival_2.43-3      zoo_1.8-4           
##  [37] iterators_1.0.10     ape_5.2              glue_1.3.0          
##  [40] gtable_0.2.0         GetoptLong_0.1.7     kernlab_0.9-27      
##  [43] shape_1.4.4          prabclus_2.2-7       DEoptimR_1.0-8      
##  [46] scales_1.0.0         pheatmap_1.0.12      mvtnorm_1.0-8       
##  [49] DBI_1.0.0            bibtex_0.4.2         Rcpp_1.0.0          
##  [52] metap_1.1            dtw_1.20-1           progress_1.2.0      
##  [55] xtable_1.8-3         htmlTable_1.13.1     reticulate_1.10     
##  [58] foreign_0.8-71       bit_1.1-14           proxy_0.4-22        
##  [61] mclust_5.4.2         SDMTools_1.1-221     Formula_1.2-3       
##  [64] tsne_0.1-3           htmlwidgets_1.3      httr_1.4.0          
##  [67] FNN_1.1.2.2          gplots_3.0.1.1       RColorBrewer_1.1-2  
##  [70] fpc_2.1-11.1         acepack_1.4.1        modeltools_0.2-22   
##  [73] ica_1.0-2            pkgconfig_2.0.2      XML_3.98-1.17       
##  [76] R.methodsS3_1.7.1    flexmix_2.3-14       nnet_7.3-12         
##  [79] tidyselect_0.2.5     labeling_0.3         rlang_0.3.1         
##  [82] later_0.8.0          munsell_0.5.0        tools_3.5.1         
##  [85] RSQLite_2.1.1        ggridges_0.5.1       evaluate_0.12       
##  [88] stringr_1.4.0        yaml_2.2.0           npsurv_0.4-0        
##  [91] bit64_0.9-7          fitdistrplus_1.0-14  robustbase_0.93-3   
##  [94] caTools_1.17.1.1     purrr_0.3.0          RANN_2.6.1          
##  [97] bindrcpp_0.2.2       pbapply_1.4-0        nlme_3.1-137        
## [100] mime_0.6             slam_0.1-44          R.oo_1.22.0         
## [103] hdf5r_1.0.1          compiler_3.5.1       rstudioapi_0.9.0    
## [106] png_0.1-7            lsei_1.2-0           tibble_2.0.1        
## [109] stringi_1.2.4        lattice_0.20-38      trimcluster_0.1-2.1 
## [112] HSMMSingleCell_1.2.0 pillar_1.3.1         combinat_0.0-8      
## [115] Rdpack_0.10-1        lmtest_0.9-36        GlobalOptions_0.1.0 
## [118] data.table_1.12.0    bitops_1.0-6         gbRd_0.4-11         
## [121] httpuv_1.4.5.1       R6_2.3.0             latticeExtra_0.6-28 
## [124] promises_1.0.1       KernSmooth_2.23-15   codetools_0.2-16    
## [127] MASS_7.3-51.1        gtools_3.8.1         assertthat_0.2.0    
## [130] rjson_0.2.20         withr_2.1.2          qlcMatrix_0.9.7     
## [133] hms_0.4.2            diptest_0.75-7       doSNOW_1.0.16       
## [136] rpart_4.1-13         tidyr_0.8.2          class_7.3-15        
## [139] rmarkdown_1.11       segmented_0.5-3.0    Rtsne_0.15          
## [142] base64enc_0.1-3
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Cluster Biomarkers

Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).

Shown here: Biomarkers of each cluster vs. all other clusters.

Biomarkers Data

All Biomarkers

# Limit to only top variable genes?: 
# Set arg 'only.pos=F' to capture negative biomarkers
# DAT <- SetIdent(DAT, ident.use = "post_clustering") 

DAT.markers <- FindAllMarkers(object = DAT, min.pct = 0.25, thresh.use = 0.25,  only.pos = F,  
                              test.use = "wilcox") # DESeq2
## Cell group 2 is empty - no cells with identity class
## Warning in FindAllMarkers(object = DAT, min.pct = 0.25, thresh.use =
## 0.25, : No DE genes identified.
DAT.markers <- DAT.markers %>% mutate(FC = 2^avg_logFC)
## Error in mutate_impl(.data, dots): Evaluation error: object 'avg_logFC' not found.
DAT.markers.sig <- DAT.markers %>% subset(p_val_adj<=0.05) 
## Error in eval(e, x, parent.frame()): object 'p_val_adj' not found
markers.summary <- DAT.markers.sig %>% group_by(cluster) %>% tally()
## Error in eval(lhs, parent, parent): object 'DAT.markers.sig' not found
# markers.summary <- base::merge(DAT.markers.sig %>% group_by(cluster) %>% tally(),
#       DAT.markers %>% group_by(cluster) %>% summarise(mean(avg_logFC)), 
#       by="cluster" )

createDT(markers.summary, caption = "Number of DEGs and Mean logFC per Cluster")
## Error in crosstalk::is.SharedData(data): object 'markers.summary' not found
createDT(DAT.markers, caption = paste("All Biomarkers: All Clusters"))

Top Biomarkers

topNum = 5
topBiomarkers <- DAT.markers %>% group_by(cluster) %>% top_n(topNum, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
createDT(DAT.markers, caption = paste("All Biomarkers: All Clusters"))

Cluster Biomarker: Violin Plots

getTopBiomarker <- function(DAT.markers, clusterID, topN=1){
  df <-DAT.markers %>%
    subset(p_val_adj<0.05 & cluster==as.character(clusterID)) %>%
    arrange(desc(avg_logFC))
    top_pct_markers <- df[1:topN,"gene"]
  return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(DAT.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(DAT.markers, clusterID=2, topN=2)


### Plot biomarkers 
plotBiomarkers <- function(DAT, biomarkers, cluster){
  biomarkerPlots <- list()
  for (marker in biomarkers){ 
    p <- VlnPlot(object = DAT, features.plot = c(marker), y.log=T, return.plotlist=T) 
    biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=0.5) + xlab( "Cluster") + ylab( "Expression")
  }
  combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) ) 

  # biomarkerPlots <- lapply(biomarkers, function(marker) {
  #   VlnPlot(object = DAT, features.plot = c(marker), y.log=T, return.plotlist=T) %>% + ggplot2::ggtitle(marker) %>% ggplotly() 
  # })    
  # return(subplot(biomarkerPlots) )
}   

top1 <- DAT.markers %>% group_by(cluster) %>% top_n(1, avg_logFC) 
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
nCols <- floor( sqrt(length(unique(top1$cluster))) )   
## Error in unique(top1$cluster): object 'top1' not found
figHeight <- nCols *7
## Error in eval(expr, envir, enclos): object 'nCols' not found
# Plot top 2 biomarker genes for each 
for (clust in unique(DAT.markers$cluster)){ 
   cat('\n')   
   cat("### Cluster ",clust,"\n") 
   biomarkers <- getTopBiomarker(DAT.markers, clusterID=clust, topN=2)
   plotBiomarkers(DAT, biomarkers, clust)  
   cat('\n')   
} 

Cluster Biomarker: Volcano Plots

##Construct the plot object
volcanoPlot <- function(DEG_df, caption="", topFC_labeled=5){
  DEG_df$sig<-  ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC<1.5, "p_val_adj<0.05",
            ifelse( DEG_df$p_val_adj<0.05  & DEG_df$avg_logFC>1.5, "p_val_adj<0.05 & avg_logFC>1.5",
                "p_val_adj>0.05"
        )) 
  DEG_df <- arrange(DEG_df, desc(sig))
  yMax  <- max(-log10(DEG_df$p_val_adj)) + max(-log10(DEG_df$p_val_adj))/3 #ifelse(max(-log10(DEG_df$p_val_adj))<45, 50, max(-log10(DEG_df$p_val_adj)) + 10)
  
  vol <- ggplot(data=DEG_df, aes(x=avg_logFC, y= -log10(p_val_adj))) +
    geom_point(alpha=0.5, size=3, aes(col=sig)) + 
    scale_color_manual(values=list("p_val_adj<0.05"="turquoise3",
                                   "p_val_adj<0.05 & avg_logFC>1.5"="purple", 
                                   "p_val_adj>0.05" = "darkgray")) +
    theme(legend.position = "none") + 
    xlab(expression(paste("Average ",log^{2},"(fold change)"))) +
    ylab(expression(paste(-log^{10},"(p-value)"))) + xlim(-2,2) + ylim(0, yMax) +
    ## ggrepl labels
    geom_text_repel(data= arrange(DEG_df,  p_val_adj, desc(avg_logFC))[1:topFC_labeled,], 
                    # filter(DEG_df, avg_logFC>=1.5)[1:10,],
                    aes(label=gene),  color="black", alpha=.5,
                    segment.color="black", segment.alpha=.5  
                    ) +  
    # Lines
    geom_vline(xintercept= -1.5,lty=4, lwd=.3, alpha=.5) + 
    geom_vline(xintercept= 1.5,lty=4, lwd=.3, alpha=.5) +
    geom_hline(yintercept= -log10(0.05),lty=4, lwd=.3, alpha=.5) + 
    ggtitle(caption) 
  print(vol)
}


# Run plots
for (clust in unique(DAT.markers$cluster)){
   cat('\n')   
   cat("### Cluster ",clust,": Volcano","\n") 
   cap <- paste("Cluster",clust,"DEG Table") 
   DEG_df <- subset(DAT.markers, cluster==as.character(clust)) %>% arrange(desc(avg_logFC))  
   volcanoPlot(DEG_df, caption = cap)
   createDT_html(DEG_df, caption = cap)
   cat('\n')   
} 

Top Biomarker Plot

Biomarkers GO

for (clust in top1$cluster){ 
  subClust <- subset( top1, cluster==clust)
  cat('\n')
  cat("### Cluster",clust,"\n")
  cat( "Biomarker\n",subClust$gene) 
  results <- Seurat::FindGeneTerms(QueryGene = subClust$gene) 
  print(results) #parse_html_notebook(results)
  cat('\n')
}
## Error in eval(expr, envir, enclos): object 'top1' not found

Biomarkers tSNE

fp <- FeaturePlot(object = DAT, features.plot = top1$gene, cols.use = c("grey", "purple"), 
    reduction.use = "tsne", nCol = nCols, do.return = T)
## Error in FeaturePlot(object = DAT, features.plot = top1$gene, cols.use = c("grey", : object 'nCols' not found

Biomarkers Heatmap

top5 <- DAT.markers %>% group_by(cluster) %>% top_n(5, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = DAT, genes.use = top5$gene, slim.col.label=T, remove.key=T) 
## Error in SetIfNull(x = genes.use, default = rownames(x = data.use)): object 'top5' not found
# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Biomarkers Ridgeplot

RidgePlot(DAT, features.plot = top1$gene,  nCol = nCols, do.sort = F)
## Error in RidgePlot(DAT, features.plot = top1$gene, nCol = nCols, do.sort = F): object 'nCols' not found

Biomarkers Split Dot Plot

Visualize biomarker expression for each cluster, by disease

top2 <- DAT.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
sdp <- SplitDotPlotGG(DAT, genes.plot = top2$gene, cols.use = c("blue","red"), 
                      x.lab.rot = T, plot.legend = T, dot.scale = 8, do.return = T, grouping.var = "dx")
## Error in vars.all %in% rownames(object@data): object 'top2' not found

Map Clusters to Known Biomarkers

  • Known Monocytes Biomarkers
    • Classical: CD14++ / CD16–
    • Intermediate: CD14++ / CD16+
    • Nonclassical: CD14+ / CD16++ (not captured in this data)

The following plots show the absolute expression of each biomarker, as opposed to avg_logFC which is dependent on the expression patterns of other cell types being compared.

Markers Dataframe

markerList <- c("CD14", "FCGR3A") 
 
get_markerDF <- function(DAT, markerList, meta_vars =c("barcode", "dx", "mut","post_clustering", "percent.mito","nGene", "nUMI")){
  exp <- DAT@scale.data %>% data.frame() 
  marker.matrix <- exp[row.names(exp) %in% markerList, ] 
  marker.matrix$Gene <- row.names(marker.matrix)
  markerMelt <- reshape2:::melt.data.frame(marker.matrix, id.vars = "Gene", variable.name = "Cell",value.name = "Expression") 
  metaSelect <-  DAT@meta.data[,meta_vars] 
  markerDF <- merge(markerMelt,metaSelect, by.x="Cell", by.y="barcode") 
  return(markerDF)
}
markerDF <- get_markerDF(DAT, markerList)
createDT(markerDF, caption = "Known Marker Expression")
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Marker ANOVAs + Boxplots

# Explore expression differences between groups
marker_vs_metadata <- function(markerDF, meta_var){ 
  # Create title from ANOVA summary
  ANOVAtitle <- function(markerDF, marker){
      nTests <- length(unique(markerDF$Gene))
      res <- anova(lm(data = subset(markerDF, Gene==marker), 
                      formula = Expression ~ eval(parse(text=meta_var))))
      
      title <-paste(paste("ANOVA (",marker, " vs. ",meta_var, ")", sep=""), 
                    ": p=",round(res$`Pr(>F)`,3), 
                    ", F=",round(res$`F value`,3), 
        ifelse(res$`Pr(>F)`<.05/nTests,"(Significant**)",
               "(Non-significant)") ) 
  }
  
  title = ""
  for (marker in unique(markerDF$Gene) ){
    cat(marker)
    title <- paste(title, "\n", ANOVAtitle(markerDF, marker))
  } 
   
 
  ggplot(markerDF, aes(x=eval(parse(text=meta_var)), y=Expression, fill= Gene)) + 
    geom_violin() + 
      geom_point( position=position_jitterdodge(jitter.width = .2, dodge.width = .9 ),   alpha=0.6, color="turquoise3") + 
    labs(title = title, x=meta_var) +
    theme(plot.title = element_text( size=10)) +
    scale_fill_manual(values=c("brown", "slategray"))
  
}

ANOVA: dx

marker_vs_metadata(markerDF, "dx")
## FCGR3ACD14

ANOVA: mut

marker_vs_metadata(markerDF, "mut")
## FCGR3ACD14

Identify Cell Types with Garnett + Monocle

Pre-trained PBMC Classifer

# load("DAT.R")    
# Convert Seurat objt to CDS object
mDAT <- monocle::importCDS(DAT,  import_all = T) 
# generate size factors for normalization later
mDAT <- estimateSizeFactors(mDAT)
# Get pre-trained PBMC classifer 
load("./Garnet/hsPBMC") # Download from: https://cole-trapnell-lab.github.io/garnett/classifiers/hsPBMC 
# Get feature genes for each cell type
feature_genes <- get_feature_genes(classifier = hsPBMC,
                                   node = "root",
                                   db = org.Hs.eg.db, 
                                   convert_ids = T)
head(feature_genes)
mDAT <- classify_cells(mDAT, hsPBMC,
                           db = org.Hs.eg.db,
                           cluster_extend = TRUE,
                           cds_gene_id_type = "SYMBOL")
head(pData(mDAT))
table(pData(mDAT)$cell_type)
table(pData(mDAT)$cluster_ext_type) 



# Run tSNE: Plot Clusters and Cell Types 
mDAT <- reduceDimension(mDAT, max_components = 3, reduction_method = "tSNE") 
commonGeoms <- labs(x="tSNE1",y="tSNE2")

plot_grid(nrow = 2,
  qplot(data = pData(mDAT), mDAT@reducedDimA[1,], mDAT@reducedDimA[2,], color = cell_type) + theme_bw() + commonGeoms,
  qplot(data = pData(mDAT), mDAT@reducedDimA[1,], mDAT@reducedDimA[2,], color = cluster_ext_type) + theme_bw() + commonGeoms
)


# Unsupervised Clustering
mDAT <- clusterCells(mDAT, num_clusters = 5)
pData(mDAT)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster",  markers = c("CD14", "FCGR3A"))
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~dx)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~mut)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~cell_type) 

DAT <- Seurat::AddMetaData(DAT, pData(mDAT)[c("garnett_cluster","cell_type","cluster_ext_type","Cluster")])

Train Classifier on Data

Pseudo-time

WARNING: Very computationally expensive!

mDAT <- reduceDimension(mDAT, max_components = 2, method = 'DDRTree')
mDAT <- orderCells(mDAT)
plot_cell_trajectory(mDAT, color_by = "dx")
plot_cell_trajectory(mDAT, color_by = "cell_type")
plot_cell_trajectory(HSMM_myo, color_by = "cell_type") +
    facet_wrap(~mut, nrow = 2)

Known Biomarkers: Heatmaps

Cells Separated

markerDF <- get_markerDF(DAT, markerList, 
             meta_vars =c("barcode", "dx", "mut","ID","post_clustering", "percent.mito","nGene", "nUMI" )) #cell_type
Spectral <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(length(unique(DAT@meta.data$mut)), "Spectral"))

# DAT <- DoKMeans(DAT, k.genes = 3) 
# KMeansHeatmap(DAT)

if (T==F){
  # Spectral <- heatmaply::Spectral(length(unique(DAT@meta.data$mut)))
  markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 
  heatmaply::heatmaply(markerMelt,  key.title="Expression",#plot_method= "ggplot",
        k_row = dim(markerMelt)[2], dendrogram = "row",
        showticklabels = c(T, F), xlab = "Known Markers", ylab = "Cells", column_text_angle = 45, 
        row_side_colors =  DAT@meta.data[,c("dx","mut")], #cell_type
        row_side_palette = Spectral
        )  %>%  colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 2)  %>% 
        colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 1)
 }else{ 
  # markerDF_sub <-subset(markerDF, Gene==markerList[1])  
  # var_to_colors(markerDF_sub, "post_clustering")  
  # library(pheatmap)
  # pheatmap(markerMelt, annotation_row = markerDF_sub[c("dx","mut","cell_type")])
  # pheatmap(markerMelt, kmeans_k = NA, annotation_row = markerDF_sub[c("dx","mut","cell_type")],
  #         cluster_cols = F, cutree_rows = length(unique(markerDF$post_clustering)),  angle_col=45 )
  library(RColorBrewer) 
  var_to_colors <- function(markerDF, metaVar){
    colors <- brewer.pal(length(unique(markerDF[metaVar]) ), "Dark2")
     sample(colors, length(unique(markerDF[metaVar])), replace = TRUE, prob = NULL)
    # metaColors <- colors[ subset(markerDF, Gene==markerList[1])[metaVar][,1] %>% as.factor() ]
    return(metaColors)
  }  
  # library(GMD)
  # myCols = cbind(var_to_colors(markerDF, "dx"), var_to_colors(markerDF, "mut")) 
  # rlab=t(cbind(
  #   var_to_colors(markerDF, "post_clustering"),
  #   var_to_colors(markerDF, "dx")
  #   ))
  #   heatmap.2(marker.matrix, key.title="Expression",  col = viridis(300), trace="none",Colv = F, Rowv = F,
  #             labRow = F, xlab = "Biomarker", ylab="Cell", cexCol=1, RowSideColors = var_to_colors(markerDF, "post_clustering")
  #             )
  # heatmap.3(markerMelt, dendrogram = 'row', kr = length(unique(markerDF)), labRow = F, 
  #           xlab = "Biomarker", ylab = "Cell", RowSideColors = rlab, RowSideColorsSize=2 )
   
  
  
  markerDF <- markerDF %>%    
    mutate_at(vars(post_clustering, dx, mut, ID), as.factor) %>% #cell_type
    mutate(Cluster = post_clustering) %>%
    arrange(post_clustering) 
  # markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 
  markerMelt <- dcast(markerDF,  Cell + post_clustering + dx + mut + ID  ~ Gene, #+ cell_type
                      fun.aggregate = mean, value.var = "Expression") %>% arrange(post_clustering)
  marker.matrix <- markerMelt[markerList] %>%as.matrix()
  row.names(marker.matrix) <- markerMelt$Cell
  
  ha = HeatmapAnnotation(df = markerDF[c("dx","mut","ID","post_clustering")], which = "row")  #cell_type
  
  ComplexHeatmap::Heatmap(marker.matrix, col=viridis(300), column_title = "Biomarker", row_title = "Cell",  
                          row_dend_reorder = F,show_row_names = F, show_column_dend = F,show_row_dend =T,
                          cluster_rows = T, column_title_side = "bottom",km = length(unique(markerMelt$post_clustering))) + ha
 

} 

Average Expression: By Clusters

markerDF <- markerDF %>% mutate(Cluster = post_clustering)
# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, Cluster) %>% summarise(meanExp = mean(Expression)) 

p <- ggplot(data = avgMarker, aes(x=Gene, y=Cluster, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Average Expression: By Disease

# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, dx, Cluster) %>% summarise(meanExp = mean(Expression)) 
p <- ggplot(data = avgMarker, aes(x=Gene, y=dx, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p 

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Known Biomarkers: Boxplot

ggplot(data = markerDF, aes(x=Cluster, y=Expression, fill=Gene)) %>% 
  + geom_boxplot(alpha=0.5) %>% + scale_fill_manual(values=c("purple", "turquoise"))  

Known Biomarkers: tSNE

#, results = 'hide', fig.show='hide'
expressionTSNE <- function(DAT, marker, colors=c("grey", "red")){
   FeaturePlot(object = DAT, features.plot = marker, cols.use = colors, 
    reduction.use = "tsne", nCol=2, do.return = T, dark.theme = T)[[1]]
  # p <- ifelse(interactive, p %>% ggplotly() %>% toWebGL(), print(p)) 
}
 plot_grid(expressionTSNE(DAT, markerList[1]),
 expressionTSNE(DAT, markerList[2], colors=c("grey", "green")))
## Error in GetDimReduction(object = object, reduction.type = reduction.use, : tsne  dimensional reduction has not been computed

Label Clusters by DGE Biomarkers

current.cluster.ids <- unique(DAT.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- DAT.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
## Error in eval(expr, envir, enclos): object 'top1' not found
DAT@ident <- plyr::mapvalues(x = DAT@ident, from = current.cluster.ids, to = new.cluster.ids)
## Error in plyr::mapvalues(x = DAT@ident, from = current.cluster.ids, to = new.cluster.ids): object 'new.cluster.ids' not found
TSNEPlot(object=DAT, do.label=T, pt.size=0.5, do.return=T) 
## Error in GetDimReduction(object = object, reduction.type = reduction.use, : tsne  dimensional reduction has not been computed
# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p)) 

DGE: All Cells

  • DGE methods available in Seurat include:
  • DESeq2DETest()
  • DiffExpTest()
  • DiffTTest()
# Available DGE methods:
## "wilcox", "bimod", "roc", "t", "tobit", "poisson", "negbinom", "MAST", "DESeq2"
runDGE <- function(DAT, meta_var, group1, group2, test.use="wilcox"){
  #print(paste("DGE_allCells",meta_var,sep="_")) 
  DAT <- SetAllIdent(DAT, id = meta_var)
  DAT <- StashIdent(DAT, save.name = meta_var)  
  DEGs <- FindMarkers(DAT, ident.1=group1, ident.2=group2, test.use=test.use, only.pos = F)
  DEGs$gene <- row.names(DEGs)
  
  cap <- paste("DEGs:\n",group1, "vs.", group2)
  createDT_html(DEG_df, caption = cap)
  volcanoPlot(DEG_df, caption = cap)
  
  DAT <- SetAllIdent(DAT, id = "post_clustering")
  return(DEGs)
}

PD vs. Controls

DEG_df <-runDGE(DAT, "dx", group1 = "PD", group2="control")
## Error in crosstalk::is.SharedData(data): object 'DEG_df' not found

LRRK vs. PD

DEG_df <-runDGE(DAT, "mut", "LRRK2", "PD")
## Error in createDT_html(DEG_df, caption = cap): object 'DEG_df' not found

cell_type

DEG_df <-runDGE(DAT, "cell_type", "Monocytes", "Dendritic cells") 

DGE: Within Clusters

DGE_within_clusters <- function(DAT, meta_var, group1, group2){ 
  for (clust in unique(DAT@meta.data$post_clustering)){
    # Subset cells by cluster  
   cat('\n')   
   cat("### ",paste("Cluster ",clust,": ",group1," vs. ", group2, sep="") , "\n")
   DAT_clustSub <- Seurat::SubsetData(DAT, accept.value = clust, subset.raw = T)  
   DEG_df <-runDGE(DAT_clustSub, meta_var, group1, group2 ) 
   cat('\n')   
  } 
}

Between Disease Groups

DGE_within_clusters(DAT, "dx", "PD", "control")    

Cluster RAJ_13357: PD vs. control

## Error in createDT_html(DEG_df, caption = cap): object 'DEG_df' not found

Between Mutation Groups

DGE_within_clusters(DAT, "mut", "LRRK2", "PD")

Cluster RAJ_13357: LRRK2 vs. PD

## Error in createDT_html(DEG_df, caption = cap): object 'DEG_df' not found

Between cell_type

DGE_within_clusters(DAT, "cell_type", "Monocytes", "Dendritic cells") 
DAT@meta.data$cell_type %>% table()

Try Different Cluster Resolutions

If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.

Find New Clusters

new_resolution <- 3.0
orig_resolution <- paste("resolution",resolution,sep="_")
DAT <- StashIdent(object = DAT, save.name = orig_resolution) 

## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
DAT <- FindClusters(object = DAT, reduction.type = "pca", dims.use = 1:10,
                     resolution = new_resolution, print.output = F)
DAT <- StashIdent(object = DAT, save.name = "resolution_3.0") 

plot1 <- TSNEPlot(object = DAT, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = DAT, do.return = TRUE, group.by = "post_clustering", 
                  no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot_grid(plot1, plot2)

Find New Biomarkers

res3.0_markers <- FindAllMarkers(object = DAT, min.pct = 0.25, thresh.use = 0.25,  only.pos = F,  test.use = "wilcox")
top1_res3.0 <- res3.0_markers %>% group_by(cluster) %>% top_n(1, avg_logFC) 

FeaturePlot(object = DAT, features.plot = top1_res3.0$gene, cols.use = c("green", "blue"))

# Set back to orig
DAT <- SetAllIdent(object = DAT, id = orig_resolution) 

Save Results

save.image(file.path(resultsPath, "scRNAseq_results.RData"))